{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "ANgU-gBQC_VP"
   },
   "source": [
    "\n",
    "# PyTorch-Transformers\n",
    "\n",
    "PyTorch-Transformers is a library developed by the HuggingFace Team that contains PyTorch implementations of popular NLP Transformers.\n",
    "\n",
    "\n",
    "## Installation ##\n",
    "\n",
    "The required packages to run the library can be installed with the following command:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Rs55LyyuC_VS",
    "outputId": "e278d03c-c465-4e3a-edea-71862a9fedf9"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: tqdm in /anaconda/lib/python3.6/site-packages (4.61.2)\n",
      "Requirement already satisfied: boto3 in /anaconda/lib/python3.6/site-packages (1.17.64)\n",
      "Requirement already satisfied: requests in /anaconda/lib/python3.6/site-packages (2.25.1)\n",
      "Requirement already satisfied: regex in /anaconda/lib/python3.6/site-packages (2021.4.4)\n",
      "Requirement already satisfied: sentencepiece in /anaconda/lib/python3.6/site-packages (0.1.95)\n",
      "Requirement already satisfied: sacremoses in /anaconda/lib/python3.6/site-packages (0.0.45)\n",
      "Requirement already satisfied: transformers in /anaconda/lib/python3.6/site-packages (4.8.2)\n",
      "Requirement already satisfied: s3transfer<0.5.0,>=0.4.0 in /anaconda/lib/python3.6/site-packages (from boto3) (0.4.2)\n",
      "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /anaconda/lib/python3.6/site-packages (from boto3) (0.10.0)\n",
      "Requirement already satisfied: botocore<1.21.0,>=1.20.64 in /anaconda/lib/python3.6/site-packages (from boto3) (1.20.64)\n",
      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /anaconda/lib/python3.6/site-packages (from botocore<1.21.0,>=1.20.64->boto3) (2.8.2)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /anaconda/lib/python3.6/site-packages (from botocore<1.21.0,>=1.20.64->boto3) (1.26.6)\n",
      "Requirement already satisfied: six>=1.5 in /anaconda/lib/python3.6/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.21.0,>=1.20.64->boto3) (1.16.0)\n",
      "Requirement already satisfied: chardet<5,>=3.0.2 in /anaconda/lib/python3.6/site-packages (from requests) (4.0.0)\n",
      "Requirement already satisfied: idna<3,>=2.5 in /anaconda/lib/python3.6/site-packages (from requests) (2.10)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /anaconda/lib/python3.6/site-packages (from requests) (2021.5.30)\n",
      "Requirement already satisfied: click in /anaconda/lib/python3.6/site-packages (from sacremoses) (7.1.2)\n",
      "Requirement already satisfied: joblib in /anaconda/lib/python3.6/site-packages (from sacremoses) (0.14.1)\n",
      "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /anaconda/lib/python3.6/site-packages (from transformers) (0.10.3)\n",
      "Requirement already satisfied: filelock in /anaconda/lib/python3.6/site-packages (from transformers) (3.0.12)\n",
      "Requirement already satisfied: huggingface-hub==0.0.12 in /anaconda/lib/python3.6/site-packages (from transformers) (0.0.12)\n",
      "Requirement already satisfied: numpy>=1.17 in /anaconda/lib/python3.6/site-packages (from transformers) (1.19.2)\n",
      "Requirement already satisfied: importlib-metadata in /anaconda/lib/python3.6/site-packages (from transformers) (3.10.0)\n",
      "Requirement already satisfied: dataclasses in /anaconda/lib/python3.6/site-packages (from transformers) (0.8)\n",
      "Requirement already satisfied: packaging in /anaconda/lib/python3.6/site-packages (from transformers) (21.0)\n",
      "Requirement already satisfied: pyyaml in /anaconda/lib/python3.6/site-packages (from transformers) (5.3.1)\n",
      "Requirement already satisfied: typing-extensions in /anaconda/lib/python3.6/site-packages (from huggingface-hub==0.0.12->transformers) (3.10.0.0)\n",
      "Requirement already satisfied: pyparsing>=2.0.2 in /anaconda/lib/python3.6/site-packages (from packaging->transformers) (2.4.7)\n",
      "Requirement already satisfied: zipp>=0.5 in /anaconda/lib/python3.6/site-packages (from importlib-metadata->transformers) (3.5.0)\n"
     ]
    }
   ],
   "source": [
    "# `%pip` installs into the environment of the running kernel, whereas a bare\n",
    "# `pip` in a bash subshell may resolve to a different Python installation.\n",
    "%pip install tqdm boto3 requests regex sentencepiece sacremoses transformers"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "85Fc9hz6C_VT"
   },
   "source": [
    "# Example: preprocessing\n",
    "\n",
    "Character strings should be turned into a sequence of tokens to be fed to Transformers. This operation is accomplished by a `Tokenizer`. Each model has its own tokenizer, and some tokenizing methods are different across tokenizers. The complete documentation can be found [here](https://huggingface.co/pytorch-transformers/main_classes/tokenizer.html)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using cache found in /Users/andrea/.cache/torch/hub/huggingface_pytorch-transformers_master\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2627, 1108, 3104, 1124, 15703, 136]\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "# Fetch, through torch.hub, the tokenizer that belongs to a given pre-trained model.\n",
    "# 'bert-base-cased' is a base BERT Transformer pre-trained on cased English text.\n",
    "tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased')\n",
    "\n",
    "text_1 = \"Who was Jim Henson ?\"\n",
    "text_2 = \"Jim Henson was a puppeteer\"\n",
    "\n",
    "# `encode` turns text into vocabulary ids. Note that a word may be split into several tokens (e.g. surnames)\n",
    "text_1_ids = tokenizer.encode(text_1, add_special_tokens=False)\n",
    "print(text_1_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "He\n",
      "##nson\n",
      "?\n"
     ]
    }
   ],
   "source": [
    "# `decode` maps token ids back to text; here it is applied to one id at a time\n",
    "for token_id in (1124, 15703, 136):\n",
    "    print(tokenizer.decode(token_id))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Who was Jim Henson?\n"
     ]
    }
   ],
   "source": [
    "# Detokenize a whole sequence of tokens: encode the sentence, then decode all ids in one call\n",
    "round_trip_ids = tokenizer.encode(text_1, add_special_tokens=False)\n",
    "print(tokenizer.decode(round_trip_ids))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[101, 2627, 1108, 3104, 1124, 15703, 136, 102, 3104, 1124, 15703, 1108, 170, 16797, 8284, 102]\n"
     ]
    }
   ],
   "source": [
    "# `encode` also accepts a pair of sentences.\n",
    "# With add_special_tokens=True, special tokens are added around the sequences\n",
    "# (for BERT: [CLS] at the beginning and [SEP] at the end of each sentence)\n",
    "sentence_pair_ids = tokenizer.encode(text_1, text_2, add_special_tokens=True)\n",
    "print(sentence_pair_ids)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "8nmAoPK9C_Vg"
   },
   "source": [
    "## Example: Predict a missing (masked) word in a sequence "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {
    "id": "sGLAr72xC_Vh"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Jim\n"
     ]
    }
   ],
   "source": [
    "text_1 = \"Who was Jim Henson ?\"\n",
    "text_2 = \"Jim Henson was a puppeteer\"\n",
    "indexed_tokens = tokenizer.encode(text_1, text_2, add_special_tokens=True)\n",
    "\n",
    "# Position of the token that will be masked and then predicted back with `BertForMaskedLM`\n",
    "masked_index = 8\n",
    "token_to_mask = indexed_tokens[masked_index]\n",
    "print(tokenizer.decode(token_to_mask))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[MASK]\n"
     ]
    }
   ],
   "source": [
    "# replace the token to be masked with a special mask_token_id\n",
    "# (this overwrites `indexed_tokens` in place, so the original id at that position is lost)\n",
    "indexed_tokens[masked_index] = tokenizer.mask_token_id\n",
    "# wrapping the list in another list makes the tensor 2-D: one row holding the whole sequence\n",
    "tokens_tensor = torch.tensor([indexed_tokens])\n",
    "# decoding the same position now shows the mask token instead of the original word\n",
    "print(tokenizer.decode(indexed_tokens[masked_index]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using cache found in /Users/andrea/.cache/torch/hub/huggingface_pytorch-transformers_master\n",
      "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n",
      "- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[  101,  2627,  1108,  3104,  1124, 15703,   136,   102,   103,  1124,\n",
      "         15703,  1108,   170, 16797,  8284,   102]])\n",
      "tensor([  119,  2627,  1108,  3104,  1124, 15703,   136,   119,  3104,  1124,\n",
      "        15703,  1108,   170, 16797,   119,   119])\n"
     ]
    }
   ],
   "source": [
    "# `BertForMaskedLM` is a Bert Model with a language modeling head on top; load its pre-trained weights.\n",
    "masked_lm_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForMaskedLM', 'bert-base-cased')\n",
    "\n",
    "# inference only, so gradient tracking is switched off\n",
    "with torch.no_grad():\n",
    "    predictions = masked_lm_model(tokens_tensor)\n",
    "\n",
    "# the model predicts a distribution over the token vocabulary for each input token;\n",
    "# print the input ids followed by the most likely id at every position\n",
    "print(tokens_tensor)\n",
    "most_likely_ids = predictions[0][0].argmax(dim=1)\n",
    "print(most_likely_ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[CLS] [SEP] .\n"
     ]
    }
   ],
   "source": [
    "# note: ids 101 and 102 are the special tokens [CLS] and [SEP]; at their positions\n",
    "# the most likely id printed by the previous cell is 119, which decodes to '.'\n",
    "print(tokenizer.decode(101),tokenizer.decode(102),tokenizer.decode(119))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Jim'"
      ]
     },
     "execution_count": 96,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# the prediction for the masked token is the most likely vocabulary id at position 'masked_index'\n",
    "masked_position_scores = predictions[0][0][masked_index]\n",
    "predicted_token = torch.argmax(masked_position_scores)\n",
    "tokenizer.decode(predicted_token)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "UMHkeDHTC_Vh"
   },
   "source": [
    "## Example: question answering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "id": "q_kfUD8kC_Vh"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using cache found in /Users/andrea/.cache/torch/hub/huggingface_pytorch-transformers_master\n",
      "Using cache found in /Users/andrea/.cache/torch/hub/huggingface_pytorch-transformers_master\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 2040, 2001, 3958, 27227, 1029, 102]\n",
      "jim\n",
      "henson\n",
      "puppet\n",
      "##eer\n"
     ]
    }
   ],
   "source": [
    "# load the question-answering model together with the tokenizer of the same checkpoint\n",
    "qa_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'\n",
    "question_answering_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForQuestionAnswering', qa_checkpoint)\n",
    "question_answering_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', qa_checkpoint)\n",
    "\n",
    "# The format is paragraph first and then question\n",
    "text_1 = \"Jim Henson was a puppeteer\"\n",
    "text_2 = \"Who was Jim Henson ?\"\n",
    "indexed_tokens = question_answering_tokenizer.encode(text_1, text_2, add_special_tokens=True)\n",
    "\n",
    "# Note that this tokenizer does not split surnames (but puppeteer still gets split)\n",
    "print(indexed_tokens)\n",
    "for token_id in (3958, 27227, 13997, 11510):\n",
    "    print(question_answering_tokenizer.decode(token_id))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model requires to specify the separation between paragraph and question.\n",
    "# Segment 0 covers [CLS], the paragraph tokens and the first [SEP]; everything after\n",
    "# the first [SEP] (the question and the final [SEP]) belongs to segment 1.\n",
    "# The boundary is derived from the token ids rather than hard-coded, so it stays\n",
    "# correct if the texts change.\n",
    "first_sep_index = indexed_tokens.index(question_answering_tokenizer.sep_token_id)\n",
    "num_first_segment_tokens = first_sep_index + 1\n",
    "segments_ids = [0] * num_first_segment_tokens + [1] * (len(indexed_tokens) - num_first_segment_tokens)\n",
    "\n",
    "# the extra list nesting makes both tensors 2-D (one row per sequence)\n",
    "segments_tensors = torch.tensor([segments_ids])\n",
    "tokens_tensor = torch.tensor([indexed_tokens])\n",
    "\n",
    "# Predict the start and end positions logits\n",
    "with torch.no_grad():\n",
    "    out = question_answering_model(tokens_tensor, token_type_ids=segments_tensors)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[1.2956e-03, 1.3718e-01, 2.0828e-03, 4.6422e-04, 1.2775e-01, 7.2051e-01,\n",
      "         3.5628e-03, 1.2956e-03, 7.7327e-04, 5.0221e-04, 3.0025e-03, 2.2000e-04,\n",
      "         6.7787e-05, 1.2957e-03]])\n",
      "tensor([[1.8842e-02, 2.2303e-03, 9.1058e-03, 3.5248e-04, 1.4338e-03, 6.1204e-03,\n",
      "         9.2058e-01, 1.8841e-02, 2.4989e-04, 2.3315e-04, 2.9274e-04, 2.5887e-03,\n",
      "         2.9281e-04, 1.8833e-02]])\n"
     ]
    }
   ],
   "source": [
    "# The model predicts (the logits of) a distribution over start and end positions\n",
    "# of the answer in the paragraph; softmax along the token axis turns them into probabilities\n",
    "\n",
    "start_probabilities = out.start_logits.softmax(dim=1)\n",
    "end_probabilities = out.end_logits.softmax(dim=1)\n",
    "print(start_probabilities)\n",
    "print(end_probabilities)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[13997, 11510]\n",
      "puppeteer\n"
     ]
    }
   ],
   "source": [
    "# get the highest prediction: the most likely start and end positions delimit the answer\n",
    "answer_start = torch.argmax(out.start_logits)\n",
    "answer_end = torch.argmax(out.end_logits)\n",
    "# the end position is inclusive, hence the +1 in the slice\n",
    "answer_tokens = indexed_tokens[answer_start:answer_end + 1]\n",
    "print(answer_tokens)\n",
    "answer = question_answering_tokenizer.decode(answer_tokens)\n",
    "print(answer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "F6CbmBxWC_Vh"
   },
   "source": [
    "## Example: predict if a sentence is a paraphrase of another one"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "id": "0J-VzeKuC_Vi"
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using cache found in /Users/andrea/.cache/torch/hub/huggingface_pytorch-transformers_master\n",
      "Using cache found in /Users/andrea/.cache/torch/hub/huggingface_pytorch-transformers_master\n"
     ]
    }
   ],
   "source": [
    "# load a model and tokenizer appropriate for sequence classification\n",
    "sequence_classification_model = torch.hub.load('huggingface/pytorch-transformers', 'modelForSequenceClassification', 'bert-base-cased-finetuned-mrpc')\n",
    "sequence_classification_tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'tokenizer', 'bert-base-cased-finetuned-mrpc')\n",
    "\n",
    "text_1 = \"Jim Henson was a puppeteer\"\n",
    "text_2 = \"Who was Jim Henson ?\"\n",
    "indexed_tokens = sequence_classification_tokenizer.encode(text_1, text_2, add_special_tokens=True)\n",
    "\n",
    "# again, the model needs to know when the second sentence starts.\n",
    "# Segment 0 covers [CLS], the first sentence and the first [SEP]; the second sentence\n",
    "# and the final [SEP] are segment 1. The boundary is derived from the token ids\n",
    "# instead of being hard-coded, so it matches the actual tokenization.\n",
    "first_sep_index = indexed_tokens.index(sequence_classification_tokenizer.sep_token_id)\n",
    "num_first_segment_tokens = first_sep_index + 1\n",
    "segments_ids = [0] * num_first_segment_tokens + [1] * (len(indexed_tokens) - num_first_segment_tokens)\n",
    "\n",
    "segments_tensors = torch.tensor([segments_ids])\n",
    "tokens_tensor = torch.tensor([indexed_tokens])\n",
    "\n",
    "# Predict the sequence classification logits\n",
    "with torch.no_grad():\n",
    "    seq_classif_logits = sequence_classification_model(tokens_tensor, token_type_ids=segments_tensors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 0.9574, -0.2855]])\n"
     ]
    }
   ],
   "source": [
    "# element 0 of the model output is printed as one row with one score per class\n",
    "print(seq_classif_logits[0])\n",
    "# class 0 means the two sentences are not paraphrasing each other"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[-0.1473,  1.5291]])\n"
     ]
    }
   ],
   "source": [
    "# a positive example\n",
    "text_1 = \"The new movie is great\"\n",
    "text_2 = \"I love the new movie\"\n",
    "indexed_tokens = sequence_classification_tokenizer.encode(text_1, text_2, \n",
    "                                                          add_special_tokens=True)\n",
    "\n",
    "# Segment 0 covers [CLS], the first sentence and the first [SEP]; the second sentence\n",
    "# and the final [SEP] are segment 1. The boundary is derived from the token ids\n",
    "# instead of being hard-coded, so it matches the actual tokenization.\n",
    "first_sep_index = indexed_tokens.index(sequence_classification_tokenizer.sep_token_id)\n",
    "num_first_segment_tokens = first_sep_index + 1\n",
    "segments_ids = [0] * num_first_segment_tokens + [1] * (len(indexed_tokens) - num_first_segment_tokens)\n",
    "\n",
    "segments_tensors = torch.tensor([segments_ids])\n",
    "tokens_tensor = torch.tensor([indexed_tokens])\n",
    "\n",
    "# inference only, so gradient tracking is switched off\n",
    "with torch.no_grad():\n",
    "    seq_classif_logits = sequence_classification_model(tokens_tensor, token_type_ids=segments_tensors)\n",
    "\n",
    "print(seq_classif_logits[0])"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "name": "huggingface_pytorch-transformers.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}